Crawling PTA & Berita

Crawling PTA & Berita#

!pip install builtwith
Collecting builtwith
  Downloading builtwith-1.3.4.tar.gz (34 kB)
  Preparing metadata (setup.py) ... ?25l?25hdone
Requirement already satisfied: six in /usr/local/lib/python3.12/dist-packages (from builtwith) (1.17.0)
Building wheels for collected packages: builtwith
  Building wheel for builtwith (setup.py) ... ?25l?25hdone
  Created wheel for builtwith: filename=builtwith-1.3.4-py3-none-any.whl size=36077 sha256=b6a543532b9a72bcc885e42ed132ed0b2752c2ea799c79cbd14b4e79b856140e
  Stored in directory: /root/.cache/pip/wheels/7f/2d/b2/606e3df914d4aeeab99c4a4e3e9a61673d2293c2e346db00c8
Successfully built builtwith
Installing collected packages: builtwith
Successfully installed builtwith-1.3.4

1. Crawling PTA#

import builtwith

# Inspect which technologies the PTA site is built with
tech_profile = builtwith.parse('https://pta.trunojoyo.ac.id')
print(tech_profile)
{'web-servers': ['Nginx'], 'javascript-frameworks': ['jQuery', 'jQuery UI']}

Crawling Berita#

import requests
from bs4 import BeautifulSoup
import time
import re
import string
import sys
import pandas as pd
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import random

# === Fungsi Tampilan Progress Bar ===
# === Progress Bar Display Function ===
def print_progress_bar(prefix, current, total, length=30):
    """Render an in-place progress bar on stdout.

    Args:
        prefix: Label printed before the bar (e.g. the category name).
        current: Number of items completed so far.
        total: Total number of items expected; 0 is handled gracefully.
        length: Width of the bar in characters.
    """
    # Guard BOTH the percentage and the fill width against total == 0;
    # guarding only the percentage still lets the `//` division raise
    # ZeroDivisionError.
    if total > 0:
        percent = (current / total) * 100
        filled = int(length * current // total)
    else:
        percent = 0
        filled = 0
    bar = 'โ–ˆ' * filled + '-' * (length - filled)
    # \r rewinds to the start of the line so the bar updates in place.
    sys.stdout.write(f'\r{prefix} |{bar}| {percent:.1f}% ({current}/{total})')
    sys.stdout.flush()
    if current == total:
        sys.stdout.write('\n')

# === Konfigurasi Session dengan Retry ===
# === Session Configuration with Retry ===
def get_session():
    """Build a requests.Session that retries transient failures.

    Returns:
        A Session whose GET requests are retried up to 5 times with
        exponential backoff on 429/500/502/503/504 responses, for both
        http and https URLs.
    """
    session = requests.Session()
    retry_strategy = Retry(
        total=5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
        backoff_factor=1
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    # Mount the adapter on both schemes: mounting only "https://" would
    # leave any plain-http article link without retry behaviour.
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session

# === Ambil Judul & Konten Artikel ===
# === Fetch an Article's Title and Body Text ===
def get_article_content_and_title(session, url):
    """Download *url* and return ``(title, content)``.

    The body text is taken from the first matching known content
    container, falling back to the <article> element. On any request
    failure a placeholder title and empty content are returned.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    }
    try:
        response = session.get(url, headers=headers, timeout=15)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}", file=sys.stderr)
        return "Judul Tidak Ditemukan", ""

    soup = BeautifulSoup(response.text, "html.parser")
    title = get_article_title(soup)

    # Known body containers across the detik.com sub-sites.
    selectors = [
        "div.detail-konten", "div.news-detail__content", "div.itp_bodycontent",
        "div.content-text", "div.article-content", "div.text_area"
    ]

    def keep_paragraphs(paragraph_tags):
        # Drop empty paragraphs and "baca juga" (related-article) teasers.
        kept = []
        for p in paragraph_tags:
            text = p.get_text(strip=True)
            if text and not text.lower().startswith("baca juga"):
                kept.append(text)
        return kept

    paragraphs = []
    for selector in selectors:
        matched_divs = soup.select(selector)
        if not matched_divs:
            continue
        for div in matched_divs:
            paragraphs.extend(keep_paragraphs(div.find_all("p")))
        if paragraphs:
            break

    # Fallback: pull paragraphs straight from the <article> element.
    if not paragraphs:
        article = soup.find("article")
        if article:
            paragraphs = keep_paragraphs(article.find_all("p"))

    return title, " ".join(paragraphs)

# === Ambil Judul dari Elemen HTML ===
# === Extract the Title from Parsed HTML ===
def get_article_title(soup):
    """Return the article title from *soup*.

    Tries the specific detik heading elements first, then falls back to
    the page <title> (with the detik site suffixes stripped).
    """
    for tag_name, css_class in (("h1", "detail-title"), ("h2", "media__title")):
        heading = soup.find(tag_name, class_=css_class)
        if heading:
            return heading.get_text(strip=True)

    page_title = soup.find("title")
    if page_title:
        text = page_title.get_text(strip=True)
        return text.replace(" - detiknews", "").replace(" - detikfinance", "")

    return "Judul Tidak Ditemukan"

# === Ekstrak ID dari URL ===
# === Extract the Numeric ID from a URL ===
def extract_id(url):
    """Return the numeric article ID embedded in *url*, or None.

    Patterns are tried in order of specificity: "/d-<id>" (detik style),
    a trailing "-<id>", and finally "<id>.html".
    """
    for pattern in (r"/d-(\d+)", r"-(\d+)$", r"(\d+)\.html$"):
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

# === Fungsi Utama Scraping ===
# === Main Scraping Function ===
def berita(categories, pages_per_category=10, max_per_category=100):
    """Crawl detik.com index pages and collect article data per category.

    Args:
        categories: Iterable of category names; each is looked up
            (lower-cased) in the hard-coded ``base_urls`` map, falling
            back to ``https://<category>.detik.com/indeks/``.
        pages_per_category: Maximum number of index pages fetched per
            category.
        max_per_category: Stop a category once this many articles with
            non-empty content have been collected.

    Side effects:
        Writes every collected row to ``crawling_detik_berita.csv``
        (UTF-8 with BOM) and prints progress / summary lines to stdout.

    Returns:
        pandas.DataFrame with columns id_berita, judul_berita,
        isi_berita_original, kategori_berita.
    """
    start_time = time.time()
    session = get_session()
    all_articles_data = []
    # Links already fetched — shared across ALL categories, so an article
    # indexed under two categories is only downloaded (and kept) once.
    processed_links = set()

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    }

    # Index-page URL per known category; unlisted categories use the
    # f-string fallback below.
    base_urls = {
        "politik": "https://news.detik.com/indeks/berita/",
        "hukum": "https://news.detik.com/indeks/berita/",
        "ekonomi": "https://finance.detik.com/indeks/",
        "detikx": "https://news.detik.com/x/indeks/",
        "hiburan": "https://hot.detik.com/indeks/",
        "internasional": "https://news.detik.com/indeks/berita/",
        "sepakbola": "https://sport.detik.com/sepakbola/indeks/",
        "olahraga": "https://sport.detik.com/indeks/",
        "lingkungan": "https://www.detik.com/tag/lingkungan",
        "otomotif": "https://oto.detik.com/indeks"
    }

    for cat in categories:
        total_collected = 0
        print(f"\n๐ŸŒ Memulai crawling kategori: {cat}")
        
        for page_count in range(1, pages_per_category + 1):
            if total_collected >= max_per_category:
                break

            url = f"{base_urls.get(cat.lower(), f'https://{cat.lower()}.detik.com/indeks/')}"
            # NOTE(review): redundant — "lingkungan" already maps to this
            # exact URL in base_urls above.
            if cat.lower() == "lingkungan":
                url = f"https://www.detik.com/tag/lingkungan"
            url = f"{url}?page={page_count}"

            try:
                r = session.get(url, headers=headers, timeout=15)
                r.raise_for_status()
                soup = BeautifulSoup(r.text, "html.parser")
                # Article anchors on detik index pages carry this class;
                # pages without it (e.g. tag pages with a different layout)
                # simply yield zero links.
                article_links = soup.select("a.media__link")

                for a in article_links:
                    link = a.get("href")
                    if not link or link in processed_links:
                        continue
                    processed_links.add(link)

                    berita_id = extract_id(link)
                    title, content = get_article_content_and_title(session, link)

                    # Only count articles whose body text was found.
                    if content:
                        all_articles_data.append({
                            "id_berita": berita_id,
                            "judul_berita": title,
                            "isi_berita_original": content,
                            "kategori_berita": cat
                        })
                        total_collected += 1
                        print_progress_bar(f"{cat}", total_collected, max_per_category)

                    if total_collected >= max_per_category:
                        break

                    # Polite random delay between article fetches.
                    time.sleep(random.uniform(1, 2))

            except requests.exceptions.RequestException as e:
                # A failed index page aborts the remaining pages of this
                # category (but not the other categories).
                print(f"\nโŒ Gagal mengakses {url}: {e}", file=sys.stderr)
                break

        print(f"\nโœ… Selesai kategori {cat} โ€” Total berita: {total_collected}")

    # === Save to CSV ===
    df = pd.DataFrame(all_articles_data)
    df.to_csv("crawling_detik_berita.csv", index=False, encoding="utf-8-sig")

    # === Elapsed-Time Statistics ===
    end_time = time.time()
    elapsed = int(end_time - start_time)
    jam, sisa = divmod(elapsed, 3600)
    menit, detik = divmod(sisa, 60)

    print("\n๐ŸŽ‰ Semua kategori selesai di-scrape!")
    print(f"๐Ÿ“Š Total berita terkumpul: {len(df)}")
    print(f"โฑ๏ธ Durasi: {jam} jam {menit} menit {detik} detik")
    print("\nBerikut 5 data pertama:\n", df.head())

    return df

if __name__ == '__main__':
    # Crawl every category of interest with the default page/article limits.
    target_categories = [
        "politik", "hukum", "ekonomi", "lingkungan", "hiburan",
        "internasional", "otomotif", "olahraga", "sepakbola",
    ]
    berita(target_categories, pages_per_category=10, max_per_category=100)
๐ŸŒ Memulai crawling kategori: politik
politik |โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 100.0% (100/100)

โœ… Selesai kategori politik โ€” Total berita: 100

๐ŸŒ Memulai crawling kategori: hukum
hukum |โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ-| 99.0% (99/100)
โœ… Selesai kategori hukum โ€” Total berita: 99

๐ŸŒ Memulai crawling kategori: ekonomi
ekonomi |โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 100.0% (100/100)

โœ… Selesai kategori ekonomi โ€” Total berita: 100

๐ŸŒ Memulai crawling kategori: lingkungan

โœ… Selesai kategori lingkungan โ€” Total berita: 0

๐ŸŒ Memulai crawling kategori: hiburan
hiburan |โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 100.0% (100/100)

โœ… Selesai kategori hiburan โ€” Total berita: 100

๐ŸŒ Memulai crawling kategori: internasional
internasional |------------------------------| 2.0% (2/100)
โœ… Selesai kategori internasional โ€” Total berita: 2

๐ŸŒ Memulai crawling kategori: otomotif
otomotif |โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 100.0% (100/100)

โœ… Selesai kategori otomotif โ€” Total berita: 100

๐ŸŒ Memulai crawling kategori: olahraga
olahraga |โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 100.0% (100/100)

โœ… Selesai kategori olahraga โ€” Total berita: 100

๐ŸŒ Memulai crawling kategori: sepakbola
sepakbola |โ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆโ–ˆ| 100.0% (100/100)

โœ… Selesai kategori sepakbola โ€” Total berita: 100

๐ŸŽ‰ Semua kategori selesai di-scrape!
๐Ÿ“Š Total berita terkumpul: 701
โฑ๏ธ Durasi: 0 jam 30 menit 1 detik

Berikut 5 data pertama:
   id_berita                                       judul_berita  \
0   8158197  Pria di Bekasi Curi Kabel Rel KA, KAI Ungkap R...   
1   8158178  3 Pelaku Ditangkap Terkait 2 Orang di Kintaman...   
2   8158168  Bripda Aprilia Eka Raih Medali Emas di Uzbekis...   
3   8158166  DKI Gratiskan Layanan Angkut Sampah Besar, War...   
4   8158148  Video Call Terakhir Letda Fauzy dengan Ayah Se...   

                                 isi_berita_original kategori_berita  
0  Petugas pengamanan (PAM) PT Kereta Api Indones...         politik  
1  Polisi telah mengamankan tiga orang dari perke...         politik  
2  Bripda Aprilia Eka Putri Lumbantungkup menjuar...         politik  
3  Pemerintah Provinsi DKI Jakartakini menggratis...         politik  
4  Perwira muda TNI AD, Letda Inf Fauzy Ahmad Sul...         politik